library(dplyr)
library(plotly)
library(ggplot2)
Is there an association between the percentage of people whose income is below the poverty line and the incidence rate 1) across neighborhoods in NYC and 2) across neighborhoods in each borough?
data_clean <- read.csv("data_final.csv")
# List all column names
names(data_clean)
## [1] "INCIDENT_KEY" "OCCUR_DATE"
## [3] "OCCUR_TIME" "BORO"
## [5] "LOC_OF_OCCUR_DESC" "PRECINCT"
## [7] "JURISDICTION_CODE" "LOC_CLASSFCTN_DESC"
## [9] "LOCATION_DESC" "STATISTICAL_MURDER_FLAG"
## [11] "PERP_AGE_GROUP" "PERP_SEX"
## [13] "PERP_RACE" "VIC_AGE_GROUP"
## [15] "VIC_SEX" "VIC_RACE"
## [17] "X_COORD_CD" "Y_COORD_CD"
## [19] "Latitude" "Longitude"
## [21] "Lon_Lat" "Neighborhood"
## [23] "neighbourhood_group" "NTA"
## [25] "NTA_2010" "Is_Holiday"
## [27] "Year" "Month"
## [29] "OCCUR_DATETIME" "Sky_Is_Dark"
## [31] "NTAType" "Total_population_nta"
## [33] "CDTA" "Number_poverty"
## [35] "Percent_poverty" "Number_education"
## [37] "Percent_education" "incident_rate_by_year_nta"
## [39] "total_population_boro" "incident_rate_by_year_boro"
# Check the structure of the data frame
str(data_clean)
## 'data.frame': 9820 obs. of 40 variables:
## $ INCIDENT_KEY : int 244608249 247542571 202853370 230311078 229224142 231246224 228559720 238210279 233431365 238238212 ...
## $ OCCUR_DATE : chr "2022-05-05" "2022-07-04" "2019-09-24" "2021-07-01" ...
## $ OCCUR_TIME : chr "00:10:00" "22:20:00" "21:00:00" "23:07:00" ...
## $ BORO : chr "MANHATTAN" "BRONX" "BRONX" "MANHATTAN" ...
## $ LOC_OF_OCCUR_DESC : chr "INSIDE" "OUTSIDE" NA NA ...
## $ PRECINCT : int 14 48 42 23 113 77 48 49 73 114 ...
## $ JURISDICTION_CODE : int 0 0 0 2 0 0 0 0 0 0 ...
## $ LOC_CLASSFCTN_DESC : chr "COMMERCIAL" "STREET" NA NA ...
## $ LOCATION_DESC : chr "VIDEO STORE" "(null)" NA "MULTI DWELL - PUBLIC HOUS" ...
## $ STATISTICAL_MURDER_FLAG : logi TRUE TRUE FALSE FALSE TRUE FALSE ...
## $ PERP_AGE_GROUP : chr "25-44" "(null)" "25-44" NA ...
## $ PERP_SEX : chr "M" "(null)" "M" NA ...
## $ PERP_RACE : chr "BLACK" "(null)" "UNKNOWN" NA ...
## $ VIC_AGE_GROUP : chr "25-44" "18-24" "25-44" "25-44" ...
## $ VIC_SEX : chr "M" "M" "M" "M" ...
## $ VIC_RACE : chr "BLACK" "BLACK" "BLACK" "BLACK" ...
## $ X_COORD_CD : num 986050 1016802 1014493 999061 1042534 ...
## $ Y_COORD_CD : num 214231 250581 242565 229912 184647 ...
## $ Latitude : num 40.8 40.9 40.8 40.8 40.7 ...
## $ Longitude : num -74 -73.9 -73.9 -73.9 -73.8 ...
## $ Lon_Lat : chr "POINT (-73.9935 40.754692)" "POINT (-73.88233 40.854402)" "POINT (-73.89071440599997 40.832416753000075)" "POINT (-73.94650786199998 40.79772716600007)" ...
## $ Neighborhood : chr "Hell's Kitchen" "Belmont" "East Morrisania" "East Harlem" ...
## $ neighbourhood_group : chr "Manhattan" "Bronx" "Bronx" "Manhattan" ...
## $ NTA : chr "Chelsea-Hudson Yards" "Belmont" "Crotona Park East" "East Harlem (North)" ...
## $ NTA_2010 : chr "Hudson Yards-Chelsea-Flatiron-Union Square" "Belmont" "Crotona Park East" "East Harlem North" ...
## $ Is_Holiday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ Year : int 2022 2022 2019 2021 2021 2021 2021 2021 2021 2021 ...
## $ Month : int 5 7 9 7 6 7 5 12 9 12 ...
## $ OCCUR_DATETIME : chr "2022-05-05 00:10:00" "2022-07-04 22:20:00" "2019-09-24 21:00:00" "2021-07-01 23:07:00" ...
## $ Sky_Is_Dark : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ NTAType : chr "Residential" "Residential" "Residential" "Residential" ...
## $ Total_population_nta : int 69741 35825 30158 64655 43090 15 35825 34623 37952 50225 ...
## $ CDTA : chr "MN 04" "BX 06" "BX 03" "MN 11" ...
## $ Number_poverty : chr "7,121" "12,919" "8,519" "20,588" ...
## $ Percent_poverty : num 11.5 39.6 29.4 32.3 11.4 NA 39.6 24.6 27.3 9.4 ...
## $ Number_education : chr "49,919" "12,455" "13,149" "34,290" ...
## $ Percent_education : num 94.6 65.6 69 75.9 83.5 NA 65.6 75.7 83.1 88.4 ...
## $ incident_rate_by_year_nta : num 0.0272 0.0809 0.0365 0.0619 0.0278 ...
## $ total_population_boro : int 18415085 22955825 11660890 20085354 13629328 38087730 29778638 29778638 38087730 13629328 ...
## $ incident_rate_by_year_boro: num 0.00167 0.00233 0.00229 0.00171 0.00217 ...
# Step 1: Check the structure of your dataset to ensure the necessary columns exist
head(data_clean)
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME BORO LOC_OF_OCCUR_DESC PRECINCT
## 1 244608249 2022-05-05 00:10:00 MANHATTAN INSIDE 14
## 2 247542571 2022-07-04 22:20:00 BRONX OUTSIDE 48
## 3 202853370 2019-09-24 21:00:00 BRONX <NA> 42
## 4 230311078 2021-07-01 23:07:00 MANHATTAN <NA> 23
## 5 229224142 2021-06-07 19:55:00 QUEENS <NA> 113
## 6 231246224 2021-07-22 01:47:00 BROOKLYN <NA> 77
## JURISDICTION_CODE LOC_CLASSFCTN_DESC LOCATION_DESC
## 1 0 COMMERCIAL VIDEO STORE
## 2 0 STREET (null)
## 3 0 <NA> <NA>
## 4 2 <NA> MULTI DWELL - PUBLIC HOUS
## 5 0 <NA> <NA>
## 6 0 <NA> MULTI DWELL - APT BUILD
## STATISTICAL_MURDER_FLAG PERP_AGE_GROUP PERP_SEX PERP_RACE VIC_AGE_GROUP
## 1 TRUE 25-44 M BLACK 25-44
## 2 TRUE (null) (null) (null) 18-24
## 3 FALSE 25-44 M UNKNOWN 25-44
## 4 FALSE <NA> <NA> <NA> 25-44
## 5 TRUE <NA> <NA> <NA> 45-64
## 6 FALSE <NA> <NA> <NA> 25-44
## VIC_SEX VIC_RACE X_COORD_CD Y_COORD_CD Latitude Longitude
## 1 M BLACK 986050 214231 40.75469 -73.99350
## 2 M BLACK 1016802 250581 40.85440 -73.88233
## 3 M BLACK 1014493 242565 40.83242 -73.89071
## 4 M BLACK 999061 229912 40.79773 -73.94651
## 5 M BLACK 1042534 184647 40.67331 -73.78989
## 6 M BLACK 1004507 182865 40.66858 -73.92698
## Lon_Lat Neighborhood
## 1 POINT (-73.9935 40.754692) Hell's Kitchen
## 2 POINT (-73.88233 40.854402) Belmont
## 3 POINT (-73.89071440599997 40.832416753000075) East Morrisania
## 4 POINT (-73.94650786199998 40.79772716600007) East Harlem
## 5 POINT (-73.78988688199998 40.673306465000046) Jamaica
## 6 POINT (-73.92697993199994 40.66858395700007) Crown Heights
## neighbourhood_group NTA
## 1 Manhattan Chelsea-Hudson Yards
## 2 Bronx Belmont
## 3 Bronx Crotona Park East
## 4 Manhattan East Harlem (North)
## 5 Queens Baisley Park
## 6 Brooklyn Lincoln Terrace Park
## NTA_2010 Is_Holiday Year Month
## 1 Hudson Yards-Chelsea-Flatiron-Union Square FALSE 2022 5
## 2 Belmont TRUE 2022 7
## 3 Crotona Park East FALSE 2019 9
## 4 East Harlem North FALSE 2021 7
## 5 Baisley Park FALSE 2021 6
## 6 Crown Heights North FALSE 2021 7
## OCCUR_DATETIME Sky_Is_Dark NTAType Total_population_nta CDTA
## 1 2022-05-05 00:10:00 TRUE Residential 69741 MN 04
## 2 2022-07-04 22:20:00 TRUE Residential 35825 BX 06
## 3 2019-09-24 21:00:00 TRUE Residential 30158 BX 03
## 4 2021-07-01 23:07:00 TRUE Residential 64655 MN 11
## 5 2021-06-07 19:55:00 FALSE Residential 43090 QN 12
## 6 2021-07-22 01:47:00 TRUE Park 15 BK 08
## Number_poverty Percent_poverty Number_education Percent_education
## 1 7,121 11.5 49,919 94.6
## 2 12,919 39.6 12,455 65.6
## 3 8,519 29.4 13,149 69.0
## 4 20,588 32.3 34,290 75.9
## 5 5,054 11.4 25,416 83.5
## 6 <NA> NA <NA> NA
## incident_rate_by_year_nta total_population_boro incident_rate_by_year_boro
## 1 0.02724366 18415085 0.001667112
## 2 0.08094906 22955825 0.002330563
## 3 0.03647457 11660890 0.002289705
## 4 0.06186683 20085354 0.001707712
## 5 0.02784869 13629328 0.002171787
## 6 13.33333333 38087730 0.001656702
# Calculate the correlation between the poverty percentage and the incident rate
correlation <- cor(data_clean$incident_rate_by_year_nta, data_clean$Percent_poverty, use = "complete.obs")
print(paste("Correlation coefficient: ", correlation))
## [1] "Correlation coefficient: 0.508408410575774"
When examining all of the neighborhoods in NYC, there is a moderate positive linear relationship between the percentage of people below the poverty line (Percent_poverty) and the incident rate by neighborhood (incident_rate_by_year_nta).(r = 0.508).
# Create a scatter plot to visualize the relationship
data_clean %>%
plot_ly(x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA, colors = "viridis",
type = "scatter", mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", BORO,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) %>%
layout(title = "Percent Below the Poverty Line and Incident Rate in NYC",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 95 observations
this plot suggests that neighborhoods with higher poverty rates tend to have higher incident rates.
# Assuming your main data frame is named 'data_clean'
# Filter data for Manhattan
manhattan_data <- data_clean %>%
filter(neighbourhood_group == "Manhattan")
# Filter data for Brooklyn
brooklyn_data <- data_clean %>%
filter(neighbourhood_group == "Brooklyn")
# Filter data for The Bronx
bronx_data <- data_clean %>%
filter(neighbourhood_group == "Bronx")
# Filter data for Staten Island
staten_island_data <- data_clean %>%
filter(neighbourhood_group == "Staten Island")
# Filter data for Queens
queens_data <- data_clean %>%
filter(neighbourhood_group == "Queens")
# Function to compute and print correlation
compute_correlation <- function(data, borough_name) {
correlation <- cor(
data$incident_rate_by_year_nta,
data$Percent_poverty,
use = "complete.obs"
)
cat("Correlation coefficient for", borough_name, ":", correlation, "\n")
}
compute_correlation(manhattan_data, "Manhattan")
## Correlation coefficient for Manhattan : 0.3344739
compute_correlation(brooklyn_data, "Brooklyn")
## Correlation coefficient for Brooklyn : 0.5686026
compute_correlation(bronx_data, "Bronx")
## Correlation coefficient for Bronx : 0.4462688
compute_correlation(staten_island_data, "Staten Island")
## Correlation coefficient for Staten Island : 0.5575556
compute_correlation(queens_data, "Queens")
## Correlation coefficient for Queens : 0.2797095
compute_correlation(manhattan_data, "Manhattan")
## Correlation coefficient for Manhattan : 0.3344739
# Function to create scatter plot with trend line
# Scatter plot for Manhattan
data_clean |>
filter(neighbourhood_group == "Manhattan") |>
plot_ly(data = _, x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA,
colors = "viridis",
type = "scatter",
mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", neighbourhood_group,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) |>
layout(title = "Percent Below the Poverty Line and Incident Rate in Manhattan",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 6 observations
Some neighborhoods, like East Harlem (North), East Harlem (South), and Chinatown-Two Bridges, seem to have both higher poverty percentages and higher incident rates, suggesting a more direct connection between poverty and incidents.
# Scatter plot for Brooklyn
data_clean |>
filter(neighbourhood_group == "Brooklyn") |>
plot_ly(data = _, x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA,
colors = "plasma",
type = "scatter",
mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", neighbourhood_group,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) |>
layout(title = "Percent Below the Poverty Line and Incident Rate in Brooklyn",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 13 observations
Brownsville, which has a high poverty rate (between 30-50%), also has some of the highest incident rates in Brooklyn, indicating that poverty may significantly impact incident rates in this neighborhood.
# Scatter plot for The Bronx
data_clean |>
filter(neighbourhood_group == "Bronx") |>
plot_ly(data = _, x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA,
colors = "magma",
type = "scatter",
mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", neighbourhood_group,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) |>
layout(title = "Percent Below the Poverty Line and Incident Rate in The Bronx",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 25 observations
Claremont Village-Claremont (East) and Fordham Heights seem to have higher incident rates compared to other neighborhoods with similar poverty levels
# Scatter plot for Queens
data_clean |>
filter(neighbourhood_group == "Queens") |>
plot_ly(data = _, x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA,
colors = "inferno",
type = "scatter",
mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", neighbourhood_group,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) |>
layout(title = "Percent Below the Poverty Line and Incident Rate in Queens",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 11 observations
Corona and East Elmhurst show relatively high incident rates, particularly compared to neighborhoods with similar poverty percentages.
# Scatter plot for Staten Island
data_clean |>
filter(neighbourhood_group == "Staten Island") |>
plot_ly(data = _, x = ~Percent_poverty, y = ~incident_rate_by_year_nta,
color = ~NTA,
colors = "inferno",
type = "scatter",
mode = "markers",
text = ~paste("Neighborhood: ", NTA, "<br>Borough: ", neighbourhood_group,
"<br>% Below Poverty Line: ", Percent_poverty,
"<br>Incident Rate: ", incident_rate_by_year_nta)) |>
layout(title = "Percent Below the Poverty Line and Incident Rate in Staten Island",
xaxis = list(title = 'Percentage of People Whose Income is Below the Poverty Line'),
yaxis = list(title = 'Incident Rate'),
legend = list(title = list(text = 'Neighborhood')))
## Warning: Ignoring 5 observations
In Staten Island, the relationship between poverty and incident rates is less pronounced compared to other boroughs.
# Load necessary libraries
library(dplyr)
library(plotly)
# 1. Load your incidents data
incidents_data <- read.csv("data_final.csv")
# 2. Filter data from 2017 to 2023
incidents_filtered <- incidents_data %>%
filter(Year >= 2017 & Year <= 2023)
# 3. Aggregate total incidents by CDTA
incidents_by_cdta <- incidents_filtered %>%
group_by(CDTA) %>%
summarise(
total_incidents = n(),
Longitude = mean(Longitude, na.rm = TRUE),
Latitude = mean(Latitude, na.rm = TRUE)
) %>%
ungroup()
# 4. Define initial breaks from 0 to 400 with intervals of 5
initial_breaks <- seq(0, 400, by = 5)
# 5. Calculate the maximum number of incidents
max_incidents <- max(incidents_by_cdta$total_incidents, na.rm = TRUE)
# 6. Check if maximum incidents exceed 400 and adjust breaks and labels accordingly
if (max_incidents > 400) {
# Extend breaks to include all incidents > 400
extended_breaks <- c(initial_breaks, Inf)
# Create labels for all intervals, including the last one for incidents >400
labels <- c(
paste(initial_breaks[-length(initial_breaks)], initial_breaks[-1], sep = "-"),
paste0("401+", max_incidents)
)
} else {
# No need to extend breaks
extended_breaks <- initial_breaks
labels <- paste(initial_breaks[-length(initial_breaks)], initial_breaks[-1], sep = "-")
}
# 7. Validate that the number of labels matches the number of intervals
num_intervals <- length(extended_breaks) - 1
num_labels <- length(labels)
if (num_intervals != num_labels) {
stop("Number of labels does not match the number of intervals.")
}
# 8. Assign incident ranges
incidents_by_cdta <- incidents_by_cdta %>%
mutate(
incident_range = cut(
total_incidents,
breaks = extended_breaks,
include.lowest = TRUE,
right = FALSE, # Left-inclusive intervals [a, b)
labels = labels
)
)
# 9. Verify the distribution of incident ranges
print(table(incidents_by_cdta$incident_range, useNA = "ifany"))
##
## 0-5 5-10 10-15 15-20 20-25 25-30 30-35 35-40 40-45 45-50
## 6 5 3 2 4 0 0 3 1 2
## 50-55 55-60 60-65 65-70 70-75 75-80 80-85 85-90 90-95 95-100
## 1 2 1 1 2 0 0 2 1 0
## 100-105 105-110 110-115 115-120 120-125 125-130 130-135 135-140 140-145 145-150
## 0 0 1 1 1 0 1 2 2 0
## 150-155 155-160 160-165 165-170 170-175 175-180 180-185 185-190 190-195 195-200
## 0 0 1 0 2 0 0 0 1 1
## 200-205 205-210 210-215 215-220 220-225 225-230 230-235 235-240 240-245 245-250
## 0 0 0 0 2 0 0 0 0 0
## 250-255 255-260 260-265 265-270 270-275 275-280 280-285 285-290 290-295 295-300
## 2 0 1 1 0 0 0 0 0 0
## 300-305 305-310 310-315 315-320 320-325 325-330 330-335 335-340 340-345 345-350
## 0 0 0 0 1 1 0 0 0 0
## 350-355 355-360 360-365 365-370 370-375 375-380 380-385 385-390 390-395 395-400
## 0 2 0 1 0 1 1 1 0 0
## 401+551
## 5
# 10. Check for missing coordinates and exclude them
missing_coords <- incidents_by_cdta %>%
filter(is.na(Longitude) | is.na(Latitude))
if (nrow(missing_coords) > 0) {
cat("Warning: The following CDTAs have missing coordinates and will be excluded from the plot:\n")
print(missing_coords$CDTA)
# Exclude CDTAs with missing coordinates
incidents_by_cdta <- incidents_by_cdta %>%
filter(!is.na(Longitude) & !is.na(Latitude))
}
# 11. Set your Mapbox access token
Sys.setenv('MAPBOX_TOKEN' = 'your_mapbox_access_token') # Replace with your actual token
# 12. Create the scatter map
incident_map <- plot_ly(
data = incidents_by_cdta,
type = "scattermapbox",
mode = "markers",
lon = ~Longitude, # Use exact column name with correct case
lat = ~Latitude, # Use exact column name with correct case
color = ~incident_range,
colors = "viridis", # Use lowercase if necessary
marker = list(
size = 8,
opacity = 0.7
),
text = ~paste(
"CDTA: ", CDTA,
"<br>Total Incidents: ", total_incidents,
"<br>Incident Range: ", incident_range
)
) %>%
layout(
title = "Total Number of Incidents Across NYC CDTAs (2017-2023)",
mapbox = list(
style = "carto-positron", # Choose desired map style
center = list(lon = -74.0060, lat = 40.7128), # Centered on NYC
zoom = 10
),
legend = list(title = list(text = 'Incident Range'))
)
# 13. Display the map
incident_map
# Load necessary libraries
library(dplyr)
library(tidyr)
library(plotly)
# 1. Load your incidents data
incidents_data <- read.csv("data_final.csv")
# 2. Filter data from 2017 to 2023 and remove rows with missing values in key columns
incidents_filtered <- incidents_data %>%
filter(Year >= 2017 & Year <= 2023) %>%
drop_na(Sky_Is_Dark, Longitude, Latitude, NTA, OCCUR_TIME) # Ensure essential columns have no NA
# Optional: Relabel Sky_Is_Dark for better readability
incidents_filtered <- incidents_filtered %>%
mutate(Sky_Condition = ifelse(Sky_Is_Dark, "Night", "Day"))
# 3. Set your Mapbox access token
Sys.setenv('MAPBOX_TOKEN' = 'your_mapbox_access_token') # Replace with your actual token
# 4. Create the interactive map using the filtered data
plot_ly(data = incidents_filtered,
type = "scattermapbox", # Specifies that we're creating a Mapbox scatter plot
mode = "markers", # Use markers to represent data points
lon = ~Longitude, # Longitude for x-axis (map)
lat = ~Latitude, # Latitude for y-axis (map)
color = ~Sky_Condition, # Color markers based on Sky_Condition
colors = c("Day" = "orange", "Night" = "blue"), # Assign specific colors
marker = list(size = 5, # Set marker size
opacity = 0.7), # Set marker opacity for better visibility
text = ~paste("NTA:", NTA,
"<br>Time of Incident:", OCCUR_TIME,
"<br>Sky Condition:", Sky_Condition) # Hover text
) |>
layout(
title = "DAY OR NIGHT Incidents in NYC", # Set the plot title
mapbox = list(
style = "carto-positron", # Map style
center = list(lon = -73.94, lat = 40.84), # Center the map on NYC
zoom = 12 # Set zoom level
),
showlegend = TRUE, # Display the legend
legend = list(title = list(text = 'Sky Condition')) # Legend title
)